library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
trainLabeled <- read.delim("~/GitHub/FCA/Data/trainSet.txt")
validLabeled <- read.delim("~/GitHub/FCA/Data/arcene_valid.txt")
wholeArceneSet <- rbind(trainLabeled,validLabeled)
wholeArceneSet$Labels <- 1*(wholeArceneSet$Labels > 0)
wholeArceneSet[,1:ncol(trainLabeled)] <- sapply(wholeArceneSet,as.double)
studyName <- "ARCENE"
dataframe <- wholeArceneSet
outcome <- "Labels"
thro <- 0.8
cexheat = 0.10
TopVariables <- 10
Some libraries
library(psych)
library(whitening)
library("vioplot")
library("rpart")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 200 | 10000 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 112 | 88 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData
numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000
if (!largeSet)
{
hm <- heatMaps(data=dataframeScaled[1:numsub,],
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> V3351 V8144 V5157 V18 V2705 V4513
#> V1 V2 V3 V4 V5 V6
#> 0.77216749 0.02329639 0.38290230 0.57614943 0.79371921 0.40835386
#>
#> Included: 9744 , Uni p: 1.539409e-05 , Base Size: 562 , Rcrit: 0.2901451
#>
#>
1 <R=1.000,thr=0.950>, Top: 294< 20 >..[Fa= 294 ]( 294 , 2802 , 0 ),<|><>Tot Used: 3096 , Added: 2802 , Zero Std: 0 , Max Cor: 1.000
#>
2 <R=1.000,thr=0.950>, Top: 902< 32 >........[Fa= 1183 ]( 889 , 2263 , 294 ),<|><>Tot Used: 4883 , Added: 2263 , Zero Std: 0 , Max Cor: 1.000
#>
3 <R=1.000,thr=0.950>, Top: 805< 19 >.......[Fa= 1971 ]( 795 , 1633 , 1183 ),<|><>Tot Used: 5775 , Added: 1633 , Zero Std: 0 , Max Cor: 1.000
#>
4 <R=1.000,thr=0.950>, Top: 494< 54 >....[Fa= 2451 ]( 488 , 1019 , 1971 ),<|><>Tot Used: 6244 , Added: 1019 , Zero Std: 0 , Max Cor: 1.000
#>
5 <R=1.000,thr=0.950>, Top: 317< 16 >...[Fa= 2761 ]( 313 , 651 , 2451 ),<|><>Tot Used: 6504 , Added: 651 , Zero Std: 0 , Max Cor: 1.000
#>
6 <R=1.000,thr=0.950>, Top: 190< 7 >.[Fa= 2949 ]( 190 , 334 , 2761 ),<|><>Tot Used: 6593 , Added: 334 , Zero Std: 0 , Max Cor: 0.999
#>
7 <R=0.999,thr=0.950>, Top: 106< 7 >.[Fa= 3055 ]( 106 , 176 , 2949 ),<|><>Tot Used: 6635 , Added: 176 , Zero Std: 0 , Max Cor: 0.998
#>
8 <R=0.998,thr=0.950>, Top: 51< 3 >[Fa= 3106 ]( 51 , 82 , 3055 ),<|><>Tot Used: 6645 , Added: 82 , Zero Std: 0 , Max Cor: 0.998
#>
9 <R=0.998,thr=0.950>, Top: 18< 3 >[Fa= 3124 ]( 18 , 34 , 3106 ),<|><>Tot Used: 6648 , Added: 34 , Zero Std: 0 , Max Cor: 0.991
#>
10 <R=0.991,thr=0.950>, Top: 7< 2 >[Fa= 3131 ]( 7 , 8 , 3124 ),<|><>Tot Used: 6650 , Added: 8 , Zero Std: 0 , Max Cor: 0.989
#>
11 <R=0.989,thr=0.950>, Top: 3< 1 >[Fa= 3134 ]( 3 , 3 , 3131 ),<|><>Tot Used: 6650 , Added: 3 , Zero Std: 0 , Max Cor: 0.950
#>
12 <R=0.950,thr=0.900>, Top: 1312< 2 >............[Fa= 3482 ]( 1271 , 1636 , 3134 ),<|><>Tot Used: 6725 , Added: 1636 , Zero Std: 0 , Max Cor: 0.999
#>
13 <R=0.999,thr=0.950>, Top: 227< 1 >..[Fa= 3542 ]( 227 , 227 , 3482 ),<|><>Tot Used: 6725 , Added: 227 , Zero Std: 0 , Max Cor: 0.975
#>
14 <R=0.975,thr=0.950>, Top: 15< 1 >[Fa= 3545 ]( 15 , 15 , 3542 ),<|><>Tot Used: 6725 , Added: 15 , Zero Std: 0 , Max Cor: 0.953
#>
15 <R=0.953,thr=0.950>, Top: 1< 1 >[Fa= 3545 ]( 1 , 1 , 3545 ),<|><>Tot Used: 6725 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#>
16 <R=0.950,thr=0.900>, Top: 523< 2 >.....[Fa= 3620 ]( 509 , 580 , 3545 ),<|><>Tot Used: 6744 , Added: 580 , Zero Std: 0 , Max Cor: 0.998
#>
17 <R=0.998,thr=0.950>, Top: 84< 1 >[Fa= 3631 ]( 84 , 84 , 3620 ),<|><>Tot Used: 6744 , Added: 84 , Zero Std: 0 , Max Cor: 0.983
#>
18 <R=0.983,thr=0.950>, Top: 10< 1 >[Fa= 3633 ]( 9 , 9 , 3631 ),<|><>Tot Used: 6744 , Added: 9 , Zero Std: 0 , Max Cor: 0.963
#>
19 <R=0.963,thr=0.950>, Top: 1< 1 >[Fa= 3633 ]( 1 , 1 , 3633 ),<|><>Tot Used: 6744 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#>
20 <R=0.950,thr=0.900>, Top: 146< 2 >.[Fa= 3646 ]( 144 , 159 , 3633 ),<|><>Tot Used: 6754 , Added: 159 , Zero Std: 0 , Max Cor: 0.994
#>
21 <R=0.994,thr=0.950>, Top: 25< 1 >[Fa= 3651 ]( 25 , 25 , 3646 ),<|><>Tot Used: 6754 , Added: 25 , Zero Std: 0 , Max Cor: 0.980
#>
22 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 3652 ]( 1 , 1 , 3651 ),<|><>Tot Used: 6754 , Added: 1 , Zero Std: 0 , Max Cor: 0.949
#>
23 <R=0.949,thr=0.900>, Top: 61< 1 >[Fa= 3654 ]( 56 , 61 , 3652 ),<|><>Tot Used: 6761 , Added: 61 , Zero Std: 0 , Max Cor: 0.997
#>
24 <R=0.997,thr=0.950>, Top: 11< 1 >[Fa= 3656 ]( 11 , 11 , 3654 ),<|><>Tot Used: 6761 , Added: 11 , Zero Std: 0 , Max Cor: 0.949
#>
25 <R=0.949,thr=0.900>, Top: 15< 1 >[Fa= 3656 ]( 15 , 15 , 3656 ),<|><>Tot Used: 6763 , Added: 15 , Zero Std: 0 , Max Cor: 0.982
#>
26 <R=0.982,thr=0.950>, Top: 6< 1 >[Fa= 3658 ]( 6 , 6 , 3656 ),<|><>Tot Used: 6763 , Added: 6 , Zero Std: 0 , Max Cor: 0.943
#>
27 <R=0.943,thr=0.900>, Top: 5< 1 >[Fa= 3660 ]( 5 , 5 , 3658 ),<|><>Tot Used: 6763 , Added: 5 , Zero Std: 0 , Max Cor: 0.900
#>
28 <R=0.900,thr=0.800>, Top: 1178< 1 >...........[Fa= 3850 ]( 1118 , 1470 , 3660 ),<|><>Tot Used: 6796 , Added: 1470 , Zero Std: 0 , Max Cor: 0.994
#>
29 <R=0.994,thr=0.950>, Top: 41< 1 >[Fa= 3864 ]( 41 , 43 , 3850 ),<|><>Tot Used: 6796 , Added: 43 , Zero Std: 0 , Max Cor: 0.950
#>
30 <R=0.950,thr=0.900>, Top: 128< 1 >.[Fa= 3887 ]( 124 , 124 , 3864 ),<|><>Tot Used: 6796 , Added: 124 , Zero Std: 0 , Max Cor: 0.984
#>
31 <R=0.984,thr=0.950>, Top: 3< 1 >[Fa= 3888 ]( 3 , 3 , 3887 ),<|><>Tot Used: 6796 , Added: 3 , Zero Std: 0 , Max Cor: 0.943
#>
32 <R=0.943,thr=0.900>, Top: 5< 1 >[Fa= 3889 ]( 5 , 5 , 3888 ),<|><>Tot Used: 6796 , Added: 5 , Zero Std: 0 , Max Cor: 0.900
#>
33 <R=0.900,thr=0.800>, Top: 398< 1 >...[Fa= 3927 ]( 356 , 418 , 3889 ),<|><>Tot Used: 6804 , Added: 418 , Zero Std: 0 , Max Cor: 0.988
#>
34 <R=0.988,thr=0.950>, Top: 8< 1 >[Fa= 3929 ]( 8 , 8 , 3927 ),<|><>Tot Used: 6804 , Added: 8 , Zero Std: 0 , Max Cor: 0.946
#>
35 <R=0.946,thr=0.900>, Top: 25< 1 >[Fa= 3931 ]( 25 , 25 , 3929 ),<|><>Tot Used: 6804 , Added: 25 , Zero Std: 0 , Max Cor: 0.899
#>
36 <R=0.899,thr=0.800>, Top: 75< 1 >[Fa= 3937 ]( 69 , 82 , 3931 ),<|><>Tot Used: 6806 , Added: 82 , Zero Std: 0 , Max Cor: 0.919
#>
37 <R=0.919,thr=0.900>, Top: 2< 1 >[Fa= 3937 ]( 2 , 2 , 3937 ),<|><>Tot Used: 6806 , Added: 2 , Zero Std: 0 , Max Cor: 0.893
#>
38 <R=0.893,thr=0.800>, Top: 19< 1 >[Fa= 3938 ]( 12 , 12 , 3937 ),<|><>Tot Used: 6806 , Added: 12 , Zero Std: 0 , Max Cor: 0.874
#>
39 <R=0.874,thr=0.800>, Top: 2< 1 >[Fa= 3938 ]( 1 , 1 , 3938 ),<|><>Tot Used: 6806 , Added: 1 , Zero Std: 0 , Max Cor: 0.835
#>
40 <R=0.835,thr=0.800>, Top: 1< 1 >[Fa= 3938 ]( 1 , 1 , 3938 ),<|><>Tot Used: 6806 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#>
41 <R=0.800,thr=0.800>
#>
[ 41 ], 0.7999029 Decor Dimension: 6806 Nused: 6806 . Cor to Base: 2187 , ABase: 9744 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
63594442
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
6579975
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
3.08
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
1.61
varratio <- attr(DEdataframe,"VarRatio")
pander::pander(tail(varratio))
| La_V8356 | La_V1834 | La_V2817 | La_V6348 | La_V4680 | La_V6860 |
|---|---|---|---|---|---|
| 3.36e-06 | 3e-06 | 2.66e-06 | 2.49e-06 | 2.34e-06 | 2.22e-06 |
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPLTM <- attr(DEdataframe,"UPLTM")
gplots::heatmap.2(1.0*(abs(UPLTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
Displaying the features associations
par(op)
#if ((ncol(dataframe) < 1000) && (ncol(dataframe) > 10))
#{
# DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)
transform <- attr(DEdataframe,"UPLTM") != 0
tnames <- colnames(transform)
colnames(transform) <- str_remove_all(colnames(transform),"La_")
transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
VertexSize <- attr(DEdataframe,"fscore") # The size depends on the variable independence relevance (fscore)
names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization
VertexSize <- VertexSize[rownames(transform)]
rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
ntop <- min(10,length(rsum))
topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
rtrans <- transform[topfeatures,]
csum <- (apply(1*(rtrans !=0),2,sum) > 1)
rtrans <- rtrans[,csum]
topfeatures <- unique(c(topfeatures,colnames(rtrans)))
print(ncol(transform))
#> [1] 6806
transform <- transform[topfeatures,topfeatures]
print(ncol(transform))
#> [1] 37
if (ncol(transform)>100)
{
csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
transform <- transform[csum,csum]
print(ncol(transform))
}
if (ncol(transform) < 150)
{
gplots::heatmap.2(transform,
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Red Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
VertexSize <- VertexSize[colnames(transform)]
gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width = 2*E(gr)$weight,
vertex.size=VertexSize,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.label.cex=(0.15+0.05*VertexSize),
vertex.label.dist=0.5 + 0.05*VertexSize,
main="Top Feature Association")
}
par(op)
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after ILAA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
classes <- unique(dataframe[1:numsub,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
topvars <- univariate_BinEnsemble(dataframe,outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
V5005, V4183, V4557, V8368, V86 and V9650
# names(topvars)
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
#}
varlistcV <- names(varratio[varratio >= 0.025])
topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe,family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
V5005, La_V3970, La_V8517, La_V2371, V86 and La_V3945
varlistcV <- varlistcV[varlistcV != outcome]
# DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
#}
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : V100 200 : V201 300 : V302 400 : V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : V907 1000 : V1008
1100 : V1108 1200 : V1209 1300 : V1309 1400 : V1409 1500 : V1509
1600 : V1610 1700 : V1710 1800 : V1810 1900 : V1911 2000 : V2012
2100 : V2113 2200 : V2213 2300 : V2313 2400 : V2417 2500 : V2518
2600 : V2620 2700 : V2722 2800 : V2822 2900 : V2922 3000 : V3023
3100 : V3123 3200 : V3223 3300 : V3326 3400 : V3428 3500 : V3528
3600 : V3629 3700 : V3734 3800 : V3835 3900 : V3935 4000 : V4038
4100 : V4140 4200 : V4243 4300 : V4344 4400 : V4445 4500 : V4547
4600 : V4649 4700 : V4751 4800 : V4853 4900 : V4954 5000 : V5055
5100 : V5156 5200 : V5256 5300 : V5360 5400 : V5462 5500 : V5564
5600 : V5666 5700 : V5768 5800 : V5868 5900 : V5970 6000 : V6070
6100 : V6171 6200 : V6271 6300 : V6372 6400 : V6473 6500 : V6573
6600 : V6675 6700 : V6777 6800 : V6881 6900 : V6983 7000 : V7088
7100 : V7190 7200 : V7291 7300 : V7393 7400 : V7496 7500 : V7597
7600 : V7701 7700 : V7803 7800 : V7904 7900 : V8007 8000 : V8108
8100 : V8209 8200 : V8310 8300 : V8414 8400 : V8516 8500 : V8620
8600 : V8721 8700 : V8822 8800 : V8925 8900 : V9026 9000 : V9128
9100 : V9232 9200 : V9332 9300 : V9433 9400 : V9533 9500 : V9638
9600 : V9739 9700 : V9841 9800 : V9944
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : V100 200 : La_V201 300 : V302 400 : La_V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : La_V907 1000 : La_V1008
1100 : La_V1108 1200 : V1209 1300 : La_V1309 1400 : La_V1409 1500 :
La_V1509
1600 : La_V1610 1700 : La_V1710 1800 : La_V1810 1900 : La_V1911 2000 :
La_V2012
2100 : La_V2113 2200 : La_V2213 2300 : V2313 2400 : La_V2417 2500 :
La_V2518
2600 : V2620 2700 : La_V2722 2800 : V2822 2900 : V2922 3000 :
La_V3023
3100 : La_V3123 3200 : La_V3223 3300 : V3326 3400 : V3428 3500 :
La_V3528
3600 : La_V3629 3700 : La_V3734 3800 : La_V3835 3900 : La_V3935 4000 :
La_V4038
4100 : La_V4140 4200 : La_V4243 4300 : V4344 4400 : V4445 4500 :
La_V4547
4600 : La_V4649 4700 : La_V4751 4800 : V4853 4900 : La_V4954 5000 :
La_V5055
5100 : V5156 5200 : La_V5256 5300 : V5360 5400 : La_V5462 5500 :
La_V5564
5600 : La_V5666 5700 : La_V5768 5800 : La_V5868 5900 : La_V5970 6000 :
La_V6070
6100 : La_V6171 6200 : La_V6271 6300 : La_V6372 6400 : La_V6473 6500 :
V6573
6600 : La_V6675 6700 : La_V6777 6800 : V6881 6900 : La_V6983 7000 :
La_V7088
7100 : La_V7190 7200 : V7291 7300 : V7393 7400 : La_V7496 7500 :
La_V7597
7600 : La_V7701 7700 : La_V7803 7800 : La_V7904 7900 : V8007 8000 :
La_V8108
8100 : La_V8209 8200 : La_V8310 8300 : La_V8414 8400 : La_V8516 8500 :
V8620
8600 : La_V8721 8700 : La_V8822 8800 : La_V8925 8900 : La_V9026 9000 :
V9128
9100 : La_V9232 9200 : La_V9332 9300 : La_V9433 9400 : V9533 9500 :
La_V9638
9600 : La_V9739 9700 : La_V9841 9800 : V9944
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| V5005 | 314.7 | 72.9 | 239 | 83.6 | 0.18466 | 0.772 |
| V4960 | 47.5 | 49.3 | 124 | 97.3 | 0.19534 | 0.751 |
| V2309 | 43.1 | 45.3 | 113 | 89.0 | 0.18665 | 0.751 |
| V8368 | 44.9 | 46.1 | 116 | 90.6 | 0.21091 | 0.751 |
| V312 | 47.2 | 48.0 | 122 | 94.7 | 0.21678 | 0.750 |
| V3365 | 46.3 | 46.9 | 119 | 92.5 | 0.22139 | 0.749 |
| V9617 | 40.9 | 44.7 | 109 | 87.5 | 0.15591 | 0.749 |
| V414 | 47.5 | 50.4 | 125 | 100.4 | 0.16265 | 0.749 |
| V9092 | 33.9 | 63.1 | 124 | 132.6 | 0.00199 | 0.748 |
| V1936 | 316.0 | 79.5 | 243 | 79.2 | 0.28495 | 0.748 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
pander::pander(finalTable)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| La_V2371 | -3.197 | 3.18 | 1.090 | 5.65 | 2.49e-04 | 0.787 |
| La_V2185 | 5.475 | 8.88 | -7.028 | 12.34 | 6.82e-04 | 0.785 |
| La_V7665 | 7.621 | 5.97 | 0.828 | 7.29 | 7.07e-01 | 0.784 |
| La_V3945 | 13.980 | 15.99 | -9.315 | 23.91 | 1.61e-05 | 0.780 |
| La_V3970 | -47.139 | 37.20 | -9.887 | 35.41 | 3.03e-01 | 0.775 |
| V5005 | 314.739 | 72.85 | 239.304 | 83.62 | 1.85e-01 | 0.772 |
| La_V8670 | -21.701 | 4.55 | -16.647 | 5.74 | 8.95e-01 | 0.768 |
| La_V8517 | 23.039 | 17.21 | 7.501 | 15.54 | 5.18e-03 | 0.766 |
| La_V3397 | -1.611 | 1.23 | -0.537 | 1.21 | 2.00e-01 | 0.757 |
| La_V4723 | 0.588 | 4.09 | -6.134 | 8.49 | 3.90e-03 | 0.756 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 3.21 | 6560 | 0.666 |
theCharformulas <- attr(dc,"LatentCharFormulas")
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| La_V2371 | - (0.918)V1620 + V2371 | -3.197 | 3.18 | 1.090 | 5.65 | 2.49e-04 | 0.787 | 0.490 | 0 |
| La_V2185 | + (5.133)V1380 + V2185 - (3.675)V8390 - (2.486)V8743 | 5.475 | 8.88 | -7.028 | 12.34 | 6.82e-04 | 0.785 | 0.592 | -2 |
| La_V7665 | - (0.144)V2562 + V7665 - (0.972)V9179 | 7.621 | 5.97 | 0.828 | 7.29 | 7.07e-01 | 0.784 | 0.514 | -2 |
| La_V3945 | - (0.847)V1711 + V3945 | 13.980 | 15.99 | -9.315 | 23.91 | 1.61e-05 | 0.780 | 0.571 | 3 |
| La_V3970 | + V3970 - (1.111)V9179 | -47.139 | 37.20 | -9.887 | 35.41 | 3.03e-01 | 0.775 | 0.605 | -1 |
| V5005 | NA | 314.739 | 72.85 | 239.304 | 83.62 | 1.85e-01 | 0.772 | 0.772 | 0 |
| V50051 | NA | 314.739 | 72.85 | 239.304 | 83.62 | 1.85e-01 | 0.772 | NA | NA |
| La_V8670 | - (1.319)V2411 + (0.289)V6231 + V8670 | -21.701 | 4.55 | -16.647 | 5.74 | 8.95e-01 | 0.768 | 0.517 | -1 |
| La_V8517 | - (0.947)V7611 + V8517 | 23.039 | 17.21 | 7.501 | 15.54 | 5.18e-03 | 0.766 | 0.615 | 2 |
| La_V3397 | + (0.489)V3001 + V3397 - (0.909)V7611 - (0.584)V8517 | -1.611 | 1.23 | -0.537 | 1.21 | 2.00e-01 | 0.757 | 0.604 | -2 |
| La_V4723 | - (0.121)V2562 - (0.920)V3122 + V4723 | 0.588 | 4.09 | -6.134 | 8.49 | 3.90e-03 | 0.756 | 0.671 | -1 |
| V4960 | NA | 47.511 | 49.25 | 123.696 | 97.33 | 1.95e-01 | 0.751 | 0.751 | NA |
| V2309 | NA | 43.057 | 45.32 | 112.616 | 89.00 | 1.87e-01 | 0.751 | 0.751 | NA |
| V8368 | NA | 44.886 | 46.06 | 116.036 | 90.62 | 2.11e-01 | 0.751 | 0.751 | NA |
| V312 | NA | 47.182 | 48.01 | 121.634 | 94.73 | 2.17e-01 | 0.750 | 0.750 | NA |
| V3365 | NA | 46.261 | 46.93 | 119.054 | 92.52 | 2.21e-01 | 0.749 | 0.749 | NA |
| V9617 | NA | 40.886 | 44.75 | 108.848 | 87.49 | 1.56e-01 | 0.749 | 0.749 | NA |
| V414 | NA | 47.466 | 50.45 | 125.348 | 100.36 | 1.63e-01 | 0.749 | 0.749 | NA |
| V9092 | NA | 33.886 | 63.11 | 123.607 | 132.62 | 1.99e-03 | 0.748 | 0.748 | NA |
| V1936 | NA | 315.955 | 79.48 | 243.062 | 79.21 | 2.85e-01 | 0.748 | 0.748 | NA |
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE) #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous])
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
EFAdataframe <- dataframeScaled
if (length(iscontinous) < 2000)
{
topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
if (topred < 2) topred <- 2
uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE) # EFA analysis
predEFA <- predict(uls,dataframeScaled[,iscontinous])
EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous])
EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
}
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
}
pander::pander(table(dataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 109 | 3 |
| 1 | 13 | 75 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.920 | 0.873 | 0.954 |
| 3 | se | 0.852 | 0.761 | 0.919 |
| 4 | sp | 0.973 | 0.924 | 0.994 |
| 6 | diag.or | 209.615 | 57.738 | 761.000 |
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
}
pander::pander(table(DEdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 100 | 12 |
| 1 | 15 | 73 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.865 | 0.810 | 0.909 |
| 3 | se | 0.830 | 0.734 | 0.901 |
| 4 | sp | 0.893 | 0.820 | 0.943 |
| 6 | diag.or | 40.556 | 17.918 | 91.792 |
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}
pander::pander(table(PCAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 104 | 8 |
| 1 | 39 | 49 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.765 | 0.700 | 0.822 |
| 3 | se | 0.557 | 0.447 | 0.663 |
| 4 | sp | 0.929 | 0.864 | 0.969 |
| 6 | diag.or | 16.333 | 7.100 | 37.573 |
par(op)
EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(EFAmodel,EFAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
}
pander::pander(table(EFAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 109 | 3 |
| 1 | 13 | 75 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.920 | 0.873 | 0.954 |
| 3 | se | 0.852 | 0.761 | 0.919 |
| 4 | sp | 0.973 | 0.924 | 0.994 |
| 6 | diag.or | 209.615 | 57.738 | 761.000 |
par(op)